# This is an example of two variables where
# we think that there may be a linear relation
# between the values in L1 and the values in L2
source("../gnrnd4.R")
gnrnd4(489031506, 3110420504, 1900005)
# look at the values
L1
L2
# it might be easier if we remember that the L1
# values are our x values
x <- L1
# the L2 values are our y values. Later we will have
# some other y-values so let us call these the
# y_observed values
y_observed <- L2
# it is hard to see a relation just by looking at the
# values. Perhaps we should look at the graph of the two
plot( x, y_observed )
# Now, just for fun, find the mean value for each variable
x_bar <- mean( x )
x_bar
y_observed_bar <- mean( y_observed )
y_observed_bar
# and let us add that to the graph as a solid point
points( x_bar, y_observed_bar, pch=19)
#
# actually, just so that we can get a better picture
# let us at least try to make the graph somewhat
# square and let us get the origin into the graph.
plot( x, y_observed, xlim=c(-5,45), ylim=c(-5,45),
      las=1, xaxp=c(-5,45, 10), yaxp=c(-5,45, 10))
points( x_bar, y_observed_bar, pch=19)
# light grid lines every 5 units, plus the axes through the origin
abline(h=seq(-5,45,5), v=seq(-5,45,5),
       lty="dotted", col="gray")
abline(h=0, v=0)
# It looks like a good line that approximates the
# plotted points would be one going through
# the points (0,5) and (25, 45). From our algebra
# background that would be a line with y-intercept
# (0,5) and a slope of 40/25=8/5. Let us draw this
# line on the graph.
abline(5, 8/5, col="red" )
#
# How good of a fit is that line?
# To answer this we need to find the values on
# the line that correspond to each of our x values.
# We will call them the y_expected values.
y_expected <- 5 + (8/5)*x
# And we could look at the values
y_expected
# And we could even plot the values
points(x, y_expected, pch=17, col="red")
# This is a bit too crowded.
# Let us go back to a
# closer view of the same stuff
plot( x, y_observed, xlim=c(10,26), ylim=c(20,44),
      las=1, xaxp=c(10,26, 8), yaxp=c(20,44, 12))
points( x_bar, y_observed_bar, pch=19)
abline(h=seq(20,46,2), v=seq(10,26,2),
       lty="dotted", col="gray")
abline(5, 8/5, col="red" )
points(x, y_expected, pch=17, col="red")
#
# To get a measure of the fit we first find all of the
# observed - expected values
x           # just to see them again
y_observed  # just to see them again
y_expected  # just to see them again
difference <- y_observed - y_expected
difference
# Then get the squares of those values
diff_sqr <- difference^2
diff_sqr
# Now get the sum of those squared values
sum( diff_sqr )
# let us try a different line, and we will see if
# it is a better fit
y_expected_2 <- 10 + 1.25*x
y_expected_2
abline( 10, 1.25, col="green")
difference <- y_observed - y_expected_2
difference
# and we can get the sum of the differences squared
sum( difference^2)
# that sum is less than the first line so
# this line is a better fit.
#
# Now the challenge is to find the best line,
# the line with the best fit possible.
# R has a function to do that, the lm() function.
# Save the fitted model so we can reuse its exact
# coefficients instead of retyping rounded values.
best_model <- lm( y_observed ~ x )
best_model   # prints the intercept (about 9.127) and slope (about 1.340)
# plot that line -- abline() accepts an lm object directly
# and uses the exact (unrounded) coefficients
abline( best_model, col="black")
# let us see the "goodness" of that model
y_expected_3 <- coef(best_model)[1] + coef(best_model)[2]*x
difference <- y_observed - y_expected_3
sum( difference^2)
#
# By the way, the differences, observed - expected,
# are called the residuals. We want to have the
# graph of the x values and the residuals to
# be scattered all over the place
plot( x, difference)
# let us return to our big graph and then
# graph the regression equation
plot( x, y_observed, xlim=c(-5,45), ylim=c(-5,45),
      las=1, xaxp=c(-5,45, 10), yaxp=c(-5,45, 10))
points( x_bar, y_observed_bar, pch=19)
abline(h=seq(-5,45,5), v=seq(-5,45,5),
       lty="dotted", col="gray")
abline(h=0, v=0)
abline( best_model, col="black")
# Note that the regression equation goes through our
# plot of the point( mean(x), mean(y) )